Based on the analysis conducted in this report, the unemployment rate is forecasted to be 3.79% and total nonfarm payroll employment is expected to increase by 170,000 in April 2019. This report will explore various forecasting methods and explain the choices that were made in generating the above forecast estimates.
suppressMessages(library(fredr))
suppressMessages(library(fpp2))
suppressMessages(library(dynlm))
suppressMessages(library(quantreg))
suppressMessages(library(ggplot2))
suppressMessages(library(vars))
suppressMessages(library(tseries))
suppressMessages(library(forecast))
suppressMessages(library(glmnet))
suppressMessages(library(rstanarm))
suppressMessages(library(rstan))
suppressMessages(library(prophet))
suppressMessages(library(rpart))
suppressMessages(library(randomForest))
suppressMessages(library(xgboost))
suppressMessages(library(kernlab))
suppressMessages(library(caret))
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(knitr))
suppressMessages(library(kableExtra))
suppressMessages(library(tseries))
suppressMessages(library(rugarch))
suppressMessages(library(rstan))
suppressMessages(library(loo))
suppressMessages(library(bridgesampling))
suppressMessages(library(mgcv))
suppressMessages(library(gridExtra))
# FRED API key.
# NOTE(review): a hardcoded API key committed to source is a security risk — this
# key should be rotated. Prefer the FRED_API_KEY environment variable; the literal
# value is kept only as a fallback so the knitted document still reproduces.
fredr_set_key(Sys.getenv("FRED_API_KEY", unset = "8782f247febb41f291821950cf9118b6"))
# Target series: unemployment rate (level) and month-over-month change ("chg")
# in total nonfarm payroll employment.
UNRATE <- fredr(series_id = "UNRATE")
PAYEMS <- fredr(series_id = "PAYEMS",units="chg")
# Column 3 of the fredr tibble is assumed to be `value` — presumably stable, but
# TODO confirm against the installed fredr version (indexing by name would be safer).
unrate = ts(UNRATE[,3], start = c(1948,1), frequency = 12)
autoplot(unrate) + labs(x = "Date", y = "Unemployment Rate (%)", title = "U.S Unemployment Rate (1948 - 2019)") + theme(legend.position = "none")
payems = ts(PAYEMS[,3], start = c(1939,1), frequency = 12)
autoplot(payems) + labs(x = "Date", y = "Change in Employment (100s of thousands)", title = "Change in Total U.S Nonfarm Payroll Employment Level (1939 - 2019)") + theme(legend.position = "none") + theme(plot.title = element_text(size=12))
Autoregressive Forecasts using Empirical Risk Minimization
# Unrate
# Forecasting using AR(1-5) (Square Loss)
# NOTE(review): include.mean=FALSE forces a zero-mean AR on a series whose sample
# mean is clearly positive; the near-unit-root ar1 estimate (~0.999, see summary
# below) compensates. TODO confirm this was intentional vs. include.mean=TRUE.
unrate_AR1_selection = Arima(unrate,order=c(1,0,0),include.mean=FALSE)
# 4-step-ahead forecast (Apr-Jul 2019); outer parentheses print the result.
(unrate_AR1 = forecast(unrate_AR1_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.796386 3.528925 4.063848 3.387339 4.205434
## May 2019 3.792776 3.414708 4.170844 3.214571 4.370981
## Jun 2019 3.789170 3.326353 4.251986 3.081353 4.496987
## Jul 2019 3.785566 3.251406 4.319727 2.968638 4.602495
# AR(2) fit under the same zero-mean specification, with 4-step forecast.
unrate_AR2_selection = Arima(unrate,order=c(2,0,0),include.mean=FALSE)
(unrate_AR2 = forecast(unrate_AR2_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.796318 3.530569 4.062067 3.389890 4.202746
## May 2019 3.792207 3.393852 4.190561 3.182976 4.401438
## Jun 2019 3.788049 3.289295 4.286803 3.025270 4.550828
## Jul 2019 3.783890 3.201749 4.366031 2.893582 4.674197
# AR(3) fit under the same zero-mean specification, with 4-step forecast.
unrate_AR3_selection = Arima(unrate,order=c(3,0,0),include.mean=FALSE)
(unrate_AR3 = forecast(unrate_AR3_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.740787 3.485067 3.996506 3.349697 4.131876
## May 2019 3.731364 3.354080 4.108648 3.154358 4.308370
## Jun 2019 3.710003 3.196038 4.223968 2.923962 4.496044
## Jul 2019 3.701287 3.073373 4.329200 2.740976 4.661597
# AR(4) fit under the same zero-mean specification, with 4-step forecast.
unrate_AR4_selection = Arima(unrate,order=c(4,0,0),include.mean=FALSE)
(unrate_AR4 = forecast(unrate_AR4_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.760875 3.509035 4.012714 3.375720 4.146030
## May 2019 3.719155 3.356548 4.081762 3.164595 4.273715
## Jun 2019 3.702590 3.214905 4.190276 2.956740 4.448440
## Jul 2019 3.679400 3.064237 4.294563 2.738589 4.620211
# AR(5) fit under the same zero-mean specification, with 4-step forecast.
unrate_AR5_selection = Arima(unrate,order=c(5,0,0),include.mean=FALSE)
(unrate_AR5 = forecast(unrate_AR5_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.782527 3.531511 4.033544 3.398631 4.166424
## May 2019 3.750936 3.392410 4.109461 3.202618 4.299253
## Jun 2019 3.723374 3.246172 4.200576 2.993557 4.453191
## Jul 2019 3.707142 3.109048 4.305235 2.792436 4.621847
# Evaluation for Square Loss Methods for Unrate
# In-sample diagnostics (coefficients, AIC/BIC, training RMSE) per AR order.
summary(unrate_AR1_selection)
## Series: unrate
## ARIMA(1,0,0) with zero mean
##
## Coefficients:
## ar1
## 0.9990
## s.e. 0.0009
##
## sigma^2 estimated as 0.04356: log likelihood=123.83
## AIC=-243.67 AICc=-243.65 BIC=-234.16
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.006110933 0.2085793 0.146867 0.04154946 2.671476 0.1693924
## ACF1
## Training set 0.1184901
# AR(2) in-sample diagnostics.
summary(unrate_AR2_selection)
## Series: unrate
## ARIMA(2,0,0) with zero mean
##
## Coefficients:
## ar1 ar2
## 1.1167 -0.1176
## s.e. 0.0340 0.0340
##
## sigma^2 estimated as 0.043: log likelihood=129.77
## AIC=-253.53 AICc=-253.5 BIC=-239.28
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.006146492 0.2071224 0.1477288 0.05177387 2.695145 0.1703863
## ACF1
## Training set -0.03136932
# AR(3) in-sample diagnostics.
summary(unrate_AR3_selection)
## Series: unrate
## ARIMA(3,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3
## 1.0848 0.1881 -0.2741
## s.e. 0.0329 0.0492 0.0330
##
## sigma^2 estimated as 0.03982: log likelihood=163.01
## AIC=-318.02 AICc=-317.97 BIC=-299.02
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.007006931 0.1991887 0.1446792 0.09418484 2.662406 0.166869
## ACF1
## Training set -0.0482898
# AR(4) in-sample diagnostics.
summary(unrate_AR4_selection)
## Series: unrate
## ARIMA(4,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4
## 1.0359 0.2218 -0.0824 -0.1767
## s.e. 0.0337 0.0489 0.0489 0.0337
##
## sigma^2 estimated as 0.03862: log likelihood=176.5
## AIC=-343.01 AICc=-342.94 BIC=-319.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.007759897 0.196051 0.1424413 0.1254265 2.620506 0.1642879
## ACF1
## Training set -0.01521411
# AR(5) in-sample diagnostics. RMSE/AIC improve steadily from AR(1) to AR(5).
summary(unrate_AR5_selection)
## Series: unrate
## ARIMA(5,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5
## 1.0198 0.2146 -0.0620 -0.0860 -0.0879
## s.e. 0.0342 0.0488 0.0494 0.0488 0.0343
##
## sigma^2 estimated as 0.03836: log likelihood=179.77
## AIC=-347.54 AICc=-347.45 BIC=-319.04
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0.008157459 0.1952956 0.1410063 0.1416879 2.597137 0.1626329
## ACF1
## Training set -0.008714
# Ljung-Box + residual plots; checkresiduals() accepts forecast objects directly.
checkresiduals(unrate_AR1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with zero mean
## Q* = 252.7, df = 23, p-value < 2.2e-16
##
## Model df: 1. Total lags used: 24
# AR(2) residual diagnostics.
checkresiduals(unrate_AR2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0) with zero mean
## Q* = 181.29, df = 22, p-value < 2.2e-16
##
## Model df: 2. Total lags used: 24
# AR(3) residual diagnostics.
checkresiduals(unrate_AR3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,0) with zero mean
## Q* = 85.417, df = 21, p-value = 9.847e-10
##
## Model df: 3. Total lags used: 24
# AR(4) residual diagnostics.
checkresiduals(unrate_AR4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,0) with zero mean
## Q* = 66.018, df = 20, p-value = 8.027e-07
##
## Model df: 4. Total lags used: 24
# AR(5) residual diagnostics. All five Ljung-Box tests reject whiteness, though
# Q* shrinks markedly as the order grows.
checkresiduals(unrate_AR5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,0) with zero mean
## Q* = 64.413, df = 19, p-value = 7.604e-07
##
## Model df: 5. Total lags used: 24
#Plotting Forecasts
# Overlay the five AR point forecasts on the tail of the observed series;
# start = 800 keeps observations 800+ of a monthly series beginning 1948,
# i.e. roughly mid-2014 onward.
autoplot(subset(unrate, start = 800), series = "Observed") + autolayer(unrate_AR1, series = "AR(1)", PI = FALSE) + autolayer(unrate_AR2, series = "AR(2)", PI = FALSE) + autolayer(unrate_AR3, series = "AR(3)", PI = FALSE) + autolayer(unrate_AR4, series = "AR(4)", PI = FALSE) + autolayer(unrate_AR5, series = "AR(5)", PI = FALSE) + ggtitle("Forecasts for Unemployment Rate") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Unemployment Rate (%)")
# Payems
# Forecasting using AR(1-5) (Square Loss)
# Same zero-mean AR specifications, applied to the monthly payroll change.
payems_AR1_selection = Arima(payems,order=c(1,0,0),include.mean=FALSE)
# 4-step-ahead forecast (Apr-Jul 2019); outer parentheses print the result.
(payems_AR1 = forecast(payems_AR1_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 124.85988 -129.2481 378.9678 -263.7647 513.4844
## May 2019 79.54076 -221.7481 380.8297 -381.2408 540.3224
## Jun 2019 50.67066 -267.7774 369.1187 -436.3536 537.6950
## Jul 2019 32.27925 -292.8742 357.4326 -465.0000 529.5585
# AR(2) fit, with 4-step forecast.
payems_AR2_selection = Arima(payems,order=c(2,0,0),include.mean=FALSE)
(payems_AR2 = forecast(payems_AR2_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 88.02816 -144.8883 320.9447 -268.1869 444.2432
## May 2019 112.06959 -137.2391 361.3783 -269.2151 493.3543
## Jun 2019 78.02065 -201.8558 357.8971 -350.0134 506.0547
## Jul 2019 74.64842 -217.6029 366.8998 -372.3114 521.6083
# AR(3) fit, with 4-step forecast.
payems_AR3_selection = Arima(payems,order=c(3,0,0),include.mean=FALSE)
(payems_AR3 = forecast(payems_AR3_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 133.81502 -94.15162 361.7817 -214.8299 482.4599
## May 2019 109.66524 -128.23537 347.5658 -254.1723 473.5028
## Jun 2019 116.32820 -139.26567 371.9221 -274.5689 507.2253
## Jul 2019 97.66526 -175.72039 371.0509 -320.4420 515.7725
# AR(4) fit, with 4-step forecast.
payems_AR4_selection = Arima(payems,order=c(4,0,0),include.mean=FALSE)
(payems_AR4 = forecast(payems_AR4_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 142.4935 -84.31657 369.3035 -204.3826 489.3695
## May 2019 134.4093 -100.92589 369.7445 -225.5048 494.3235
## Jun 2019 116.0046 -133.35616 365.3654 -265.3598 497.3690
## Jul 2019 116.3982 -145.68936 378.4857 -284.4301 517.2265
# AR(5) fit, with 4-step forecast.
payems_AR5_selection = Arima(payems,order=c(5,0,0),include.mean=FALSE)
(payems_AR5 = forecast(payems_AR5_selection, h = 4))
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 144.1258 -81.67221 369.9239 -201.2025 489.4541
## May 2019 142.9635 -90.70492 376.6320 -214.4015 500.3286
## Jun 2019 139.3661 -106.62747 385.3596 -236.8486 515.5807
## Jul 2019 115.3776 -140.34509 371.1002 -275.7165 506.4716
# Evaluation for Square Loss Methods for Payems
# In-sample diagnostics per AR order. MPE/MAPE are Inf because payems changes
# cross zero, so percentage errors are undefined there — compare RMSE/MAE/AIC.
summary(payems_AR1_selection)
## Series: payems
## ARIMA(1,0,0) with zero mean
##
## Coefficients:
## ar1
## 0.6370
## s.e. 0.0248
##
## sigma^2 estimated as 39316: log likelihood=-6453.46
## AIC=12910.92 AICc=12910.93 BIC=12920.65
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 45.7002 198.1784 132.692 Inf Inf 0.6262197 -0.324929
# AR(2) in-sample diagnostics.
summary(payems_AR2_selection)
## Series: payems
## ARIMA(2,0,0) with zero mean
##
## Coefficients:
## ar1 ar2
## 0.3817 0.4003
## s.e. 0.0295 0.0295
##
## sigma^2 estimated as 33032: log likelihood=-6369.36
## AIC=12744.72 AICc=12744.75 BIC=12759.33
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 27.45412 181.5567 118.8735 Inf Inf 0.5610055 -0.1079646
# AR(3) in-sample diagnostics.
summary(payems_AR3_selection)
## Series: payems
## ARIMA(3,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3
## 0.2984 0.3208 0.2075
## s.e. 0.0315 0.0313 0.0316
##
## sigma^2 estimated as 31642: log likelihood=-6348.26
## AIC=12704.52 AICc=12704.56 BIC=12724
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 21.8769 177.6057 117.1603 Inf Inf 0.5529201 -0.03705291
# AR(4) in-sample diagnostics.
summary(payems_AR4_selection)
## Series: payems
## ARIMA(4,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4
## 0.2767 0.2869 0.1757 0.1056
## s.e. 0.0320 0.0328 0.0329 0.0321
##
## sigma^2 estimated as 31322: log likelihood=-6342.89
## AIC=12695.78 AICc=12695.84 BIC=12720.12
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 19.62167 176.6125 116.3833 Inf Inf 0.5492534 -0.02281626
# AR(5) in-sample diagnostics.
summary(payems_AR5_selection)
## Series: payems
## ARIMA(5,0,0) with zero mean
##
## Coefficients:
## ar1 ar2 ar3 ar4 ar5
## 0.2663 0.2696 0.1469 0.0778 0.0996
## s.e. 0.0320 0.0331 0.0340 0.0332 0.0321
##
## sigma^2 estimated as 31043: log likelihood=-6338.11
## AIC=12688.22 AICc=12688.3 BIC=12717.43
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 17.72833 175.7327 116.5471 Inf Inf 0.5500264 -0.003355769
# Ljung-Box + residual plots for the payems AR(1) forecast.
checkresiduals(payems_AR1)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0) with zero mean
## Q* = 168.52, df = 23, p-value < 2.2e-16
##
## Model df: 1. Total lags used: 24
# AR(2) residual diagnostics.
checkresiduals(payems_AR2)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,0) with zero mean
## Q* = 69.617, df = 22, p-value = 7.599e-07
##
## Model df: 2. Total lags used: 24
# AR(3) residual diagnostics.
checkresiduals(payems_AR3)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,0,0) with zero mean
## Q* = 44.922, df = 21, p-value = 0.001773
##
## Model df: 3. Total lags used: 24
# AR(4) residual diagnostics.
checkresiduals(payems_AR4)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(4,0,0) with zero mean
## Q* = 34.234, df = 20, p-value = 0.02459
##
## Model df: 4. Total lags used: 24
# AR(5) residual diagnostics — the only payems model whose Ljung-Box test does
# not reject at 10% (p = 0.107).
checkresiduals(payems_AR5)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(5,0,0) with zero mean
## Q* = 26.895, df = 19, p-value = 0.1071
##
## Model df: 5. Total lags used: 24
#Plotting Forecasts
# Overlay the five AR point forecasts on the tail of the observed payems series
# (start = 800 subsets to the later part of the sample).
autoplot(subset(payems, start = 800), series = "Observed") + autolayer(payems_AR1, series = "AR(1)", PI = FALSE) + autolayer(payems_AR2, series = "AR(2)", PI = FALSE) + autolayer(payems_AR3, series = "AR(3)", PI = FALSE) + autolayer(payems_AR4, series = "AR(4)", PI = FALSE) + autolayer(payems_AR5, series = "AR(5)", PI = FALSE) + ggtitle("Forecasts for Changes in Nonfarm Payroll Employment") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)")
Residual plots look similar for all methods; however, in an effort to be more selective, I will remove AR(1) and AR(2) forecasts from my final average, as their RMSE values are slightly worse than the other methods.
Vector Autoregressions Forecasts Incorporating Additional Variables
Below I created vector autoregressions jointly in unrate, payems, prices of the NASDAQ (Fred Series: NASDAQCOM), US Retail Gas Prices (Fred Series: GASREGW), Consumer Price Index (Fred Series: CPIAUCSL), and the effective Federal Funds Rate (Fred Series: EFFR). I decided to choose these variables after conducting some independent research online on economic indicators that have been found to be linked to employment figures. The following research study was used in my decision-making process: https://pdfs.semanticscholar.org/0ab6/643904b461588428735b0b8c4b8e686bab35.pdf.
I found that all the series I added had issues with stationarity and dependence. I discovered the stationarity issue by using the Augmented Dickey-Fuller Test, whose null hypothesis is that the series contains a unit root (i.e., is non-stationary). I adjusted the time series by using the diff function as shown below - this improved the stationarity and dependency issues.
#Loading NASDAQ, GAS prices, CPI, and FFR data
NASDAQ = read.csv("NASDAQCOM.csv")
# Assumes column 2 of the CSV holds the observation values and that the series
# is monthly starting 1971M3 — TODO confirm against the downloaded file.
nasdaq = ts(NASDAQ[,2], start = c(1971,3), frequency = 12)
ggseasonplot(nasdaq)
# ADF test: null hypothesis is a unit root (non-stationarity).
adf.test(nasdaq)
##
## Augmented Dickey-Fuller Test
##
## data: nasdaq
## Dickey-Fuller = -0.69033, Lag order = 8, p-value = 0.9714
## alternative hypothesis: stationary
# Difference as many times as ndiffs() recommends, then re-test stationarity.
nasdaq = diff(nasdaq, differences = ndiffs(nasdaq))
adf.test(nasdaq)
## Warning in adf.test(nasdaq): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: nasdaq
## Dickey-Fuller = -13.733, Lag order = 8, p-value = 0.01
## alternative hypothesis: stationary
# Autocorrelation of the differenced series (dependence check).
acf(nasdaq)
GAS = read.csv("GASREGW.csv")
# NOTE(review): GASREGW is published weekly on FRED; frequency = 12 assumes the
# CSV was downloaded at a monthly aggregation — TODO confirm, otherwise the
# start/frequency here are wrong.
gas = ts(GAS[,2], start = c(1990,9), frequency = 12)
ggseasonplot(gas)
# ADF test (null: unit root) on the level series.
adf.test(gas)
##
## Augmented Dickey-Fuller Test
##
## data: gas
## Dickey-Fuller = -2.2635, Lag order = 6, p-value = 0.4656
## alternative hypothesis: stationary
# Difference per ndiffs() recommendation, then re-test stationarity.
gas = diff(gas, differences = ndiffs(gas))
adf.test(gas)
## Warning in adf.test(gas): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: gas
## Dickey-Fuller = -9.5982, Lag order = 6, p-value = 0.01
## alternative hypothesis: stationary
# Autocorrelation of the differenced gas series.
acf(gas)
CPI = read.csv("CPIAUCSL.csv")
# CPI (all urban consumers), monthly from 1947M1; column 2 assumed to be values.
cpi = ts(CPI[,2], start = c(1947,1), frequency = 12)
ggseasonplot(cpi)
# ADF test (null: unit root) on the level series.
adf.test(cpi)
##
## Augmented Dickey-Fuller Test
##
## data: cpi
## Dickey-Fuller = -2.8571, Lag order = 9, p-value = 0.2155
## alternative hypothesis: stationary
# Difference per ndiffs() recommendation, then re-test stationarity.
cpi = diff(cpi, differences = ndiffs(cpi))
adf.test(cpi)
## Warning in adf.test(cpi): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: cpi
## Dickey-Fuller = -16.481, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# Autocorrelation of the differenced CPI series.
acf(cpi)
FFR = read.csv("EFFR.csv")
# NOTE(review): EFFR is a daily FRED series; frequency = 12 assumes the CSV was
# downloaded at a monthly aggregation — TODO confirm.
ffr = ts(FFR[,2], start = c(2000,7), frequency = 12)
ggseasonplot(ffr)
# ADF test (null: unit root) on the level series.
adf.test(ffr)
##
## Augmented Dickey-Fuller Test
##
## data: ffr
## Dickey-Fuller = -2.9627, Lag order = 6, p-value = 0.1716
## alternative hypothesis: stationary
# Difference per ndiffs() recommendation.
ffr = diff(ffr, differences = ndiffs(ffr))
# BUG FIX: the original chunk re-ran adf.test(nasdaq) and acf(nasdaq) here — a
# copy-paste error. Every other series re-tests itself after differencing, so
# the differenced ffr was never actually checked. The ADF/ACF output that
# followed this chunk in the rendered report was the (stale, duplicated) nasdaq
# result; re-knit to regenerate it for ffr.
adf.test(ffr)
acf(ffr)
# Align the six series: cbind() on ts objects pads with NA outside each series'
# span, and na.omit() keeps only the common overlap. EFFR starts in 2000, so it
# is the binding constraint on the estimation sample.
df_2 = na.omit(cbind(unrate, payems, nasdaq, gas, cpi, ffr))
colnames(df_2) = c("unrate", "payems", "nasdaq", "gas", "cpi", "ffr")
# VAR with 1 lag; 4-month-ahead forecast, printed via the outer parentheses.
var.1 = VAR(df_2, p = 1)
(var.1_forecast = forecast(var.1,4))
## unrate
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.784548 3.612656 3.956440 3.521661 4.047435
## May 2019 3.776178 3.516622 4.035735 3.379221 4.173136
## Jun 2019 3.773278 3.428418 4.118138 3.245860 4.300696
## Jul 2019 3.779829 3.348421 4.211237 3.120047 4.439610
##
## payems
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 170.5150 3.829352 337.2007 -84.40871 425.4388
## May 2019 166.8042 -47.666709 381.2750 -161.20073 494.8091
## Jun 2019 152.3719 -88.867382 393.6113 -216.57177 521.3157
## Jul 2019 140.0371 -117.730368 397.8045 -254.18422 534.2584
##
## nasdaq
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 48.7074963 -158.7602 256.1752 -268.5870 366.0020
## May 2019 -20.0349777 -243.2204 203.1504 -361.3676 321.2976
## Jun 2019 5.1537569 -218.5821 228.8896 -337.0207 347.3282
## Jul 2019 0.5525415 -223.3046 224.4097 -341.8074 342.9125
##
## gas
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.0581049634 -0.1262646 0.2424745 -0.2238639 0.3400739
## May 2019 0.0123192813 -0.2022523 0.2268908 -0.3158396 0.3404781
## Jun 2019 -0.0007349428 -0.2177075 0.2162376 -0.3325659 0.3310960
## Jul 2019 -0.0050158078 -0.2222120 0.2121803 -0.3371886 0.3271570
##
## cpi
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -0.47545204 -1.2466688 0.2957647 -1.654926 0.7040222
## May 2019 -0.12631997 -0.9520481 0.6994082 -1.389162 1.1365222
## Jun 2019 -0.03702156 -0.8948902 0.8208471 -1.349018 1.2749753
## Jul 2019 -0.01691086 -0.8768941 0.8430724 -1.332142 1.2983200
##
## ffr
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -0.052188405 -0.2149164 0.1105395 -0.3010593 0.1966825
## May 2019 0.010611978 -0.1592988 0.1805228 -0.2492442 0.2704681
## Jun 2019 -0.012254638 -0.1831584 0.1586491 -0.2736294 0.2491201
## Jul 2019 -0.003828483 -0.1748858 0.1672288 -0.2654380 0.2577811
# VAR with 2 lags; 4-month-ahead forecast.
var.2 = VAR(df_2, p = 2)
(var.2_forecast = forecast(var.2,4))
## unrate
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.819508 3.656120 3.982896 3.569627 4.069389
## May 2019 3.817063 3.600182 4.033943 3.485373 4.148753
## Jun 2019 3.855172 3.573062 4.137282 3.423723 4.286622
## Jul 2019 3.880000 3.529658 4.230342 3.344198 4.415801
##
## payems
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 101.0290 -53.62355 255.6815 -135.4917 337.5496
## May 2019 126.7408 -48.60800 302.0897 -141.4321 394.9137
## Jun 2019 113.3847 -87.37936 314.1488 -193.6574 420.4269
## Jul 2019 100.0482 -117.51908 317.6154 -232.6922 432.7886
##
## nasdaq
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 45.44990 -152.5305 243.4303 -257.3350 348.2348
## May 2019 36.73652 -180.7761 254.2491 -295.9203 369.3933
## Jun 2019 -24.07420 -243.9751 195.8267 -360.3836 312.2352
## Jul 2019 12.94950 -208.8434 234.7424 -326.2534 352.1524
##
## gas
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.044054657 -0.1355045 0.2236138 -0.2305573 0.3186667
## May 2019 -0.012304366 -0.2279236 0.2033149 -0.3420655 0.3174568
## Jun 2019 -0.004416892 -0.2241993 0.2153655 -0.3405450 0.3317113
## Jul 2019 -0.014093785 -0.2343284 0.2061408 -0.3509135 0.3227259
##
## cpi
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -0.53474505 -1.2678239 0.1983338 -1.655892 0.5864024
## May 2019 -0.24408589 -1.0313928 0.5432210 -1.448168 0.9599962
## Jun 2019 0.11397957 -0.7356153 0.9635744 -1.185364 1.4133228
## Jul 2019 -0.02829024 -0.8981828 0.8416024 -1.358676 1.3020957
##
## ffr
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.011984888 -0.1452179 0.1691876 -0.2284360 0.2524058
## May 2019 -0.003680842 -0.1711289 0.1637672 -0.2597706 0.2524089
## Jun 2019 -0.021510859 -0.1924420 0.1494203 -0.2829275 0.2399058
## Jul 2019 -0.010154719 -0.1828404 0.1625309 -0.2742546 0.2539452
# VAR with 3 lags; 4-month-ahead forecast.
var.3 = VAR(df_2, p = 3)
(var.3_forecast = forecast(var.3,4))
## unrate
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.776989 3.612503 3.941474 3.525430 4.028548
## May 2019 3.770655 3.555019 3.986292 3.440867 4.100443
## Jun 2019 3.825203 3.555116 4.095290 3.412141 4.238265
## Jul 2019 3.853711 3.517103 4.190319 3.338913 4.368509
##
## payems
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 133.2410 -18.82980 285.3119 -99.33123 365.8133
## May 2019 102.3208 -66.29431 270.9359 -155.55376 360.1954
## Jun 2019 133.3276 -55.00041 321.6556 -154.69523 421.3504
## Jul 2019 109.2348 -94.62420 313.0938 -202.54064 421.0102
##
## nasdaq
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -107.54269 -297.7351 82.64969 -398.4168 183.3315
## May 2019 72.33887 -141.0458 285.72354 -254.0048 398.6826
## Jun 2019 53.37743 -166.3732 273.12809 -282.7022 389.4571
## Jul 2019 -20.82912 -242.8418 201.18351 -360.3681 318.7099
##
## gas
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.027394896 -0.1528655 0.2076553 -0.2482896 0.3030794
## May 2019 -0.003603685 -0.2185786 0.2113712 -0.3323794 0.3251721
## Jun 2019 0.013418809 -0.2063837 0.2332213 -0.3227400 0.3495777
## Jul 2019 -0.014556712 -0.2359026 0.2067891 -0.3530760 0.3239626
##
## cpi
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -0.72422512 -1.4500014 0.001551141 -1.834204 0.3857539
## May 2019 -0.16313429 -0.9559253 0.629656697 -1.375603 1.0493349
## Jun 2019 0.16527297 -0.6753332 1.005879105 -1.120323 1.4508691
## Jul 2019 0.03227664 -0.8416474 0.906200655 -1.304275 1.3688281
##
## ffr
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.12956964 -0.02647686 0.28561614 -0.1090829 0.3682222
## May 2019 -0.03471638 -0.20339601 0.13396326 -0.2926896 0.2232569
## Jun 2019 -0.09469548 -0.26729609 0.07790513 -0.3586653 0.1692744
## Jul 2019 0.01826820 -0.15579024 0.19232664 -0.2479312 0.2844676
# VAR with 4 lags; 4-month-ahead forecast.
var.4 = VAR(df_2, p = 4)
(var.4_forecast = forecast(var.4,4))
## unrate
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.763723 3.600559 3.926887 3.514185 4.013261
## May 2019 3.740531 3.526110 3.954953 3.412602 4.068460
## Jun 2019 3.761494 3.485065 4.037924 3.338732 4.184256
## Jul 2019 3.821379 3.473623 4.169134 3.289532 4.353225
##
## payems
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 152.9755 1.722164 304.2288 -78.34650 384.2974
## May 2019 154.8804 -10.930315 320.6912 -98.70522 408.4661
## Jun 2019 136.9906 -47.726625 321.7079 -145.51002 419.4913
## Jul 2019 141.4516 -62.204926 345.1081 -170.01416 452.9173
##
## nasdaq
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -182.33785 -367.18609 2.510388 -465.0388 100.3631
## May 2019 -55.08436 -267.86856 157.699851 -380.5097 270.3410
## Jun 2019 173.69372 -46.80719 394.194629 -163.5333 510.9208
## Jul 2019 63.07739 -159.49972 285.654504 -277.3249 403.4797
##
## gas
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.046778737 -0.1323110 0.2258685 -0.2271154 0.3206729
## May 2019 0.021559216 -0.1917252 0.2348436 -0.3046311 0.3477495
## Jun 2019 0.059523504 -0.1583505 0.2773975 -0.2736860 0.3927330
## Jul 2019 -0.005752113 -0.2266422 0.2151379 -0.3435743 0.3320701
##
## cpi
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -0.5910216 -1.3112959 0.1292527 -1.692586 0.5105428
## May 2019 -0.1032851 -0.8862663 0.6796960 -1.300751 1.0941812
## Jun 2019 0.2194156 -0.6141956 1.0530268 -1.055483 1.4943140
## Jul 2019 -0.1192999 -0.9901350 0.7515353 -1.451127 1.2125276
##
## ffr
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.117871724 -0.03739561 0.27313906 -0.1195892 0.3553326
## May 2019 0.009755814 -0.15929074 0.17880237 -0.2487786 0.2682902
## Jun 2019 -0.115710495 -0.28983557 0.05841458 -0.3820118 0.1505908
## Jul 2019 -0.015134778 -0.19016232 0.15989276 -0.2828163 0.2525467
# VAR with 5 lags; 4-month-ahead forecast.
var.5 = VAR(df_2, p = 5)
(var.5_forecast = forecast(var.5,4))
## unrate
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 3.732872 3.571814 3.893930 3.486555 3.979189
## May 2019 3.739122 3.529462 3.948781 3.418475 4.059769
## Jun 2019 3.775919 3.501242 4.050597 3.355837 4.196002
## Jul 2019 3.775203 3.426469 4.123937 3.241861 4.308546
##
## payems
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 114.5014 -35.35119 264.3540 -114.67837 343.6812
## May 2019 194.5945 31.12072 358.0682 -55.41706 444.6060
## Jun 2019 231.0063 49.45302 412.5596 -46.65549 508.6681
## Jul 2019 174.7241 -23.48688 372.9350 -128.41342 477.8616
##
## nasdaq
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -202.54004 -382.11671 -22.96338 -477.1789 72.09877
## May 2019 -12.62012 -217.94721 192.70696 -326.6408 301.40055
## Jun 2019 164.87106 -48.80671 378.54882 -161.9209 491.66300
## Jul 2019 -29.59249 -247.37120 188.18621 -362.6563 303.47130
##
## gas
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.07616408 -0.10397631 0.2563045 -0.1993369 0.3516650
## May 2019 0.06727989 -0.14665707 0.2812169 -0.2599085 0.3944682
## Jun 2019 0.12513951 -0.09306441 0.3433434 -0.2085746 0.4588536
## Jul 2019 0.02029685 -0.20059954 0.2411933 -0.3175350 0.3581287
##
## cpi
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 -0.30209427 -1.0181391 0.4139506 -1.3971904 0.7930019
## May 2019 -0.03129378 -0.8217219 0.7591344 -1.2401493 1.1775617
## Jun 2019 0.36568376 -0.4781720 1.2095395 -0.9248823 1.6562498
## Jul 2019 -0.27216249 -1.1469277 0.6026027 -1.6100004 1.0656754
##
## ffr
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Apr 2019 0.14598181 0.0001736102 0.29179001 -0.07701259 0.3689762
## May 2019 0.02253829 -0.1384181533 0.18349473 -0.22362335 0.2686999
## Jun 2019 -0.08585673 -0.2516085930 0.07989514 -0.33935233 0.1676389
## Jul 2019 -0.02177301 -0.1907322407 0.14718621 -0.28017385 0.2366278
# Evaluating VAR(1-5) (Unrate, Payems, Nasdaq, Gas, CPI, FFR)
# Information criteria for lag selection. AIC keeps falling through p = 5 while
# BIC is minimized at p = 1 — the usual AIC/BIC disagreement over lag length.
AIC(var.1)
## [1] 5062.369
AIC(var.2)
## [1] 4950.203
AIC(var.3)
## [1] 4925.694
AIC(var.4)
## [1] 4906.418
AIC(var.5)
## [1] 4863.024
BIC(var.1)
## [1] 5205.281
BIC(var.2)
## [1] 5215.259
BIC(var.3)
## [1] 5312.568
BIC(var.4)
## [1] 5414.778
BIC(var.5)
## [1] 5492.54
# Breusch-Godfrey test on the unrate equation residuals, VAR(1).
checkresiduals(var.1$varresult$unrate)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 23.276, df = 10, p-value = 0.009773
# Unrate equation residuals, VAR(2).
checkresiduals(var.2$varresult$unrate)
##
## Breusch-Godfrey test for serial correlation of order up to 16
##
## data: Residuals
## LM test = 18.854, df = 16, p-value = 0.2763
# Unrate equation residuals, VAR(3).
checkresiduals(var.3$varresult$unrate)
##
## Breusch-Godfrey test for serial correlation of order up to 22
##
## data: Residuals
## LM test = 25.883, df = 22, p-value = 0.2567
# Unrate equation residuals, VAR(4).
checkresiduals(var.4$varresult$unrate)
##
## Breusch-Godfrey test for serial correlation of order up to 28
##
## data: Residuals
## LM test = 39.768, df = 28, p-value = 0.06934
# Unrate equation residuals, VAR(5).
checkresiduals(var.5$varresult$unrate)
##
## Breusch-Godfrey test for serial correlation of order up to 34
##
## data: Residuals
## LM test = 46.444, df = 34, p-value = 0.07562
# Breusch-Godfrey test on the payems equation residuals, VAR(1).
checkresiduals(var.1$varresult$payems)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 44.164, df = 10, p-value = 3.076e-06
# Payems equation residuals, VAR(2).
checkresiduals(var.2$varresult$payems)
##
## Breusch-Godfrey test for serial correlation of order up to 16
##
## data: Residuals
## LM test = 24.803, df = 16, p-value = 0.07337
# Payems equation residuals, VAR(3).
checkresiduals(var.3$varresult$payems)
##
## Breusch-Godfrey test for serial correlation of order up to 22
##
## data: Residuals
## LM test = 35.064, df = 22, p-value = 0.03815
# Payems equation residuals, VAR(4).
checkresiduals(var.4$varresult$payems)
##
## Breusch-Godfrey test for serial correlation of order up to 28
##
## data: Residuals
## LM test = 50.27, df = 28, p-value = 0.006034
# Payems equation residuals, VAR(5) — the only payems equation not rejecting
# serial correlation at 10% (p = 0.117).
checkresiduals(var.5$varresult$payems)
##
## Breusch-Godfrey test for serial correlation of order up to 34
##
## data: Residuals
## LM test = 44.009, df = 34, p-value = 0.1169
# Plots
# Overlay the VAR(1)-VAR(5) point forecasts on the tails of both target series.
autoplot(subset(unrate, start = 800), series = "Observed") + autolayer(var.1_forecast$forecast$unrate, series = "VAR(1)", PI = FALSE) + autolayer(var.2_forecast$forecast$unrate, series = "VAR(2)", PI = FALSE) + autolayer(var.3_forecast$forecast$unrate, series = "VAR(3)", PI = FALSE) + autolayer(var.4_forecast$forecast$unrate, series = "VAR(4)", PI = FALSE) + autolayer(var.5_forecast$forecast$unrate, series = "VAR(5)", PI = FALSE) + ggtitle("Forecasts for Unemployment Rate") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Unemployment Rate (%)")
autoplot(subset(payems, start = 800), series = "Observed") + autolayer(var.1_forecast$forecast$payems, series = "VAR(1)", PI = FALSE) + autolayer(var.2_forecast$forecast$payems, series = "VAR(2)", PI = FALSE) + autolayer(var.3_forecast$forecast$payems, series = "VAR(3)", PI = FALSE) + autolayer(var.4_forecast$forecast$payems, series = "VAR(4)", PI = FALSE) + autolayer(var.5_forecast$forecast$payems, series = "VAR(5)", PI = FALSE) + ggtitle("Forecasts for Changes in Nonfarm Payroll Employment") + guides(colour=guide_legend(title="Series and Forecasts")) + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)")
Residual plots and AIC/BIC values look similar across the board - so I will include all forecasts in my final average.
Multivariate Forecasts - VAR (5) Using Penalized ERM
# unrate
# Unrate
# Lasso path (gaussian) of unrate on the other five variables.
a = glmnet(df_2[,c(2:6)],df_2[,1],family="gaussian")
# NOTE(review): predict() here dispatches to predict.varest (vars package),
# which has no `s` argument — `s = a$lambda` is silently swallowed by `...`,
# so this just prints the unpenalized VAR(5) forecast (default n.ahead = 10)
# and the glmnet fit above is never used. The same applies to every predict()
# call in this section, which is why all four outputs are identical.
# TODO: predict from the glmnet object instead (predict(a, newx = ..., s = ...)).
predict(var.5,s=a$lambda)$fcst$unrate
## fcst lower upper CI
## [1,] 3.732872 3.486555 3.979189 0.2463172
## [2,] 3.739122 3.418475 4.059769 0.3206470
## [3,] 3.775919 3.355837 4.196002 0.4200826
## [4,] 3.775203 3.241861 4.308546 0.5333426
## [5,] 3.730270 3.086354 4.374186 0.6439159
## [6,] 3.707339 2.942221 4.472457 0.7651178
## [7,] 3.712405 2.825655 4.599155 0.8867498
## [8,] 3.761736 2.748961 4.774510 1.0127743
## [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# NOTE(review): "mgaussian" is intended for multi-response y; with a single
# response column it adds nothing over "gaussian" here.
b = glmnet(df_2[,c(2:6)],df_2[,1],family="mgaussian")
# `s` is not an argument of predict.varest and is silently ignored — this
# prints the plain VAR(5) forecast; the glmnet fit is unused.
predict(var.5,s=b$lambda)$fcst$unrate
## fcst lower upper CI
## [1,] 3.732872 3.486555 3.979189 0.2463172
## [2,] 3.739122 3.418475 4.059769 0.3206470
## [3,] 3.775919 3.355837 4.196002 0.4200826
## [4,] 3.775203 3.241861 4.308546 0.5333426
## [5,] 3.730270 3.086354 4.374186 0.6439159
## [6,] 3.707339 2.942221 4.472457 0.7651178
## [7,] 3.712405 2.825655 4.599155 0.8867498
## [8,] 3.761736 2.748961 4.774510 1.0127743
## [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# Ridge path (alpha = 0). NOTE(review): the name `c` shadows base::c() — avoid.
c = glmnet(df_2[,c(2:6)],df_2[,1],family="gaussian", alpha = 0)
# `s` is silently ignored by predict.varest — plain VAR(5) forecast again.
predict(var.5,s=c$lambda)$fcst$unrate
## fcst lower upper CI
## [1,] 3.732872 3.486555 3.979189 0.2463172
## [2,] 3.739122 3.418475 4.059769 0.3206470
## [3,] 3.775919 3.355837 4.196002 0.4200826
## [4,] 3.775203 3.241861 4.308546 0.5333426
## [5,] 3.730270 3.086354 4.374186 0.6439159
## [6,] 3.707339 2.942221 4.472457 0.7651178
## [7,] 3.712405 2.825655 4.599155 0.8867498
## [8,] 3.761736 2.748961 4.774510 1.0127743
## [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# Ridge (alpha = 0) with the multi-response family on a single response.
d = glmnet(df_2[,c(2:6)],df_2[,1],family="mgaussian", alpha = 0)
# `s` is silently ignored by predict.varest — plain VAR(5) forecast again.
predict(var.5,s=d$lambda)$fcst$unrate
## fcst lower upper CI
## [1,] 3.732872 3.486555 3.979189 0.2463172
## [2,] 3.739122 3.418475 4.059769 0.3206470
## [3,] 3.775919 3.355837 4.196002 0.4200826
## [4,] 3.775203 3.241861 4.308546 0.5333426
## [5,] 3.730270 3.086354 4.374186 0.6439159
## [6,] 3.707339 2.942221 4.472457 0.7651178
## [7,] 3.712405 2.825655 4.599155 0.8867498
## [8,] 3.761736 2.748961 4.774510 1.0127743
## [9,] 3.816069 2.676091 4.956048 1.1399782
## [10,] 3.851799 2.586725 5.116873 1.2650736
# Payems
# Lasso on payems against the other five series. NOTE(review): predict()
# below runs on `var.5` (varest), whose method has no `s` argument, so the
# glmnet fit `a` is never used -- all four payems glmnet variants print the
# same VAR(5) forecast table.
a = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="gaussian")
predict(var.5,s=a$lambda)$fcst$payems
## fcst lower upper CI
## [1,] 114.50142 -114.67837 343.6812 229.1798
## [2,] 194.59447 -55.41706 444.6060 250.0115
## [3,] 231.00633 -46.65549 508.6681 277.6618
## [4,] 174.72407 -128.41342 477.8616 303.1375
## [5,] 146.18593 -178.22388 470.5957 324.4098
## [6,] 100.65108 -244.37474 445.6769 345.0258
## [7,] 102.56050 -256.12452 461.2455 358.6850
## [8,] 104.99433 -263.46527 473.4539 368.4596
## [9,] 83.20564 -292.53628 458.9476 375.7419
## [10,] 67.29079 -316.18083 450.7624 383.4716
# Lasso with family = "mgaussian" (single-column response -- TODO confirm).
# NOTE(review): `s = b$lambda` is silently ignored by predict.varest, so the
# output repeats the VAR(5) forecast.
b = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="mgaussian")
predict(var.5,s=b$lambda)$fcst$payems
## fcst lower upper CI
## [1,] 114.50142 -114.67837 343.6812 229.1798
## [2,] 194.59447 -55.41706 444.6060 250.0115
## [3,] 231.00633 -46.65549 508.6681 277.6618
## [4,] 174.72407 -128.41342 477.8616 303.1375
## [5,] 146.18593 -178.22388 470.5957 324.4098
## [6,] 100.65108 -244.37474 445.6769 345.0258
## [7,] 102.56050 -256.12452 461.2455 358.6850
## [8,] 104.99433 -263.46527 473.4539 368.4596
## [9,] 83.20564 -292.53628 458.9476 375.7419
## [10,] 67.29079 -316.18083 450.7624 383.4716
# Ridge fit (alpha = 0) for payems. NOTE(review): the fit is unused -- the
# `s` argument is dropped by predict.varest -- and `c` shadows base::c.
c = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="gaussian", alpha = 0)
predict(var.5,s=c$lambda)$fcst$payems
## fcst lower upper CI
## [1,] 114.50142 -114.67837 343.6812 229.1798
## [2,] 194.59447 -55.41706 444.6060 250.0115
## [3,] 231.00633 -46.65549 508.6681 277.6618
## [4,] 174.72407 -128.41342 477.8616 303.1375
## [5,] 146.18593 -178.22388 470.5957 324.4098
## [6,] 100.65108 -244.37474 445.6769 345.0258
## [7,] 102.56050 -256.12452 461.2455 358.6850
## [8,] 104.99433 -263.46527 473.4539 368.4596
## [9,] 83.20564 -292.53628 458.9476 375.7419
## [10,] 67.29079 -316.18083 450.7624 383.4716
# Ridge fit with family = "mgaussian" (single-column response -- TODO
# confirm). NOTE(review): as above, predict() ignores `s` and re-prints the
# VAR(5) forecast.
d = glmnet(df_2[,c(1,3,4,5,6)],df_2[,2],family="mgaussian", alpha = 0)
predict(var.5,s=d$lambda)$fcst$payems
## fcst lower upper CI
## [1,] 114.50142 -114.67837 343.6812 229.1798
## [2,] 194.59447 -55.41706 444.6060 250.0115
## [3,] 231.00633 -46.65549 508.6681 277.6618
## [4,] 174.72407 -128.41342 477.8616 303.1375
## [5,] 146.18593 -178.22388 470.5957 324.4098
## [6,] 100.65108 -244.37474 445.6769 345.0258
## [7,] 102.56050 -256.12452 461.2455 358.6850
## [8,] 104.99433 -263.46527 473.4539 368.4596
## [9,] 83.20564 -292.53628 458.9476 375.7419
## [10,] 67.29079 -316.18083 450.7624 383.4716
Bayesian Methods: Autoregression (6) using Laplace Priors
# unrate
# Stack the unemployment rate and its first six lags over a common monthly
# window; stats::lag(x, -k) shifts the series so column k+1 holds u_{t-k}.
unratelags<-data.frame(
window(unrate,start=c(1948,7),end=c(2019,3)),
window(stats::lag(unrate,-1),start=c(1948,7),end=c(2019,3)),
window(stats::lag(unrate,-2),start=c(1948,7),end=c(2019,3)),
window(stats::lag(unrate,-3),start=c(1948,7),end=c(2019,3)),
window(stats::lag(unrate,-4),start=c(1948,7),end=c(2019,3)),
window(stats::lag(unrate,-5),start=c(1948,7),end=c(2019,3)),
window(stats::lag(unrate,-6),start=c(1948,7),end=c(2019,3)))
colnames(unratelags)<-c("unrate","unratel1","unratel2","unratel3","unratel4","unratel5", "unratel6")
untoday<-length(unratelags$unrate) #Last observation
# Regressor row for the one-step-ahead forecast: everything shifts forward
# one period (today's value becomes lag 1, lag 1 becomes lag 2, ...).
# NOTE(review): 7 values are collected below but only 6 names are assigned,
# so the trailing unratel6 value keeps an NA name and is unused by the
# model formula -- confirm this is intentional.
untodayslags<-data.frame(unratelags$unrate[untoday],
unratelags$unratel1[untoday],unratelags$unratel2[untoday],
unratelags$unratel3[untoday],unratelags$unratel4[untoday],
unratelags$unratel5[untoday],unratelags$unratel6[untoday])
names(untodayslags)<-c("unratel1","unratel2","unratel3","unratel4","unratel5", "unratel6")
# Run Stan chains in parallel across available cores.
options(mc.cores = parallel::detectCores())
# Bayesian AR(6) for unrate via rstanarm with Laplace (double-exponential,
# lasso-like) priors on the lag coefficients; defaults give 4000 posterior
# draws (see "sample: 4000" in the summary below).
AR6_unrate_c = stan_glm(unrate ~ unratel1 + unratel2 + unratel3 + unratel4 + unratel5 + unratel6, data = unratelags, prior = laplace())
summary(AR6_unrate_c, digits = 5)
##
## Model Info:
##
## function: stan_glm
## family: gaussian [identity]
## formula: unrate ~ unratel1 + unratel2 + unratel3 + unratel4 + unratel5 +
## unratel6
## algorithm: sampling
## priors: see help('prior_summary')
## sample: 4000 (posterior sample size)
## observations: 849
## predictors: 7
##
## Estimates:
## mean sd 2.5% 25% 50% 75%
## (Intercept) 0.10092 0.02431 0.05374 0.08468 0.10065 0.11684
## unratel1 0.98959 0.03352 0.92614 0.96687 0.98900 1.01204
## unratel2 0.21621 0.04690 0.12508 0.18412 0.21638 0.24907
## unratel3 -0.06053 0.04735 -0.15218 -0.09354 -0.06086 -0.02887
## unratel4 -0.06263 0.04699 -0.15321 -0.09532 -0.06324 -0.03038
## unratel5 0.02284 0.04603 -0.06607 -0.00792 0.02203 0.05327
## unratel6 -0.12296 0.03312 -0.18919 -0.14475 -0.12360 -0.10020
## sigma 0.19244 0.00462 0.18381 0.18917 0.19233 0.19549
## mean_PPD 5.77068 0.00943 5.75230 5.76444 5.77046 5.77701
## log-posterior 169.88249 4.01246 161.01179 167.36055 170.35050 172.79960
## 97.5%
## (Intercept) 0.15024
## unratel1 1.05546
## unratel2 0.30608
## unratel3 0.03303
## unratel4 0.03033
## unratel5 0.11476
## unratel6 -0.05885
## sigma 0.20181
## mean_PPD 5.78938
## log-posterior 176.36728
##
## Diagnostics:
## mcse Rhat n_eff
## (Intercept) 0.00038 0.99957 4079
## unratel1 0.00054 0.99946 3891
## unratel2 0.00077 0.99960 3754
## unratel3 0.00079 0.99938 3553
## unratel4 0.00085 0.99995 3092
## unratel5 0.00106 1.00051 1871
## unratel6 0.00058 1.00022 3215
## sigma 0.00008 1.00318 3138
## mean_PPD 0.00016 1.00149 3357
## log-posterior 0.16104 1.00470 621
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Draw from the posterior predictive distribution at the shifted lag row:
# 4000 simulated values of next month's unemployment rate.
predict_AR6_unrate_c = posterior_predict(AR6_unrate_c, newdata = untodayslags)
summary(predict_AR6_unrate_c)
## 1
## Min. :3.131
## 1st Qu.:3.684
## Median :3.814
## Mean :3.813
## 3rd Qu.:3.942
## Max. :4.527
# 95% posterior predictive interval for the one-step-ahead forecast.
quantile(predict_AR6_unrate_c, c(0.025, 0.975))
## 2.5% 97.5%
## 3.438197 4.192865
# Evaluation
# NOTE(review): this re-prints the model summary shown above verbatim;
# consider dropping one of the two calls.
summary(AR6_unrate_c, digits = 5)
##
## Model Info:
##
## function: stan_glm
## family: gaussian [identity]
## formula: unrate ~ unratel1 + unratel2 + unratel3 + unratel4 + unratel5 +
## unratel6
## algorithm: sampling
## priors: see help('prior_summary')
## sample: 4000 (posterior sample size)
## observations: 849
## predictors: 7
##
## Estimates:
## mean sd 2.5% 25% 50% 75%
## (Intercept) 0.10092 0.02431 0.05374 0.08468 0.10065 0.11684
## unratel1 0.98959 0.03352 0.92614 0.96687 0.98900 1.01204
## unratel2 0.21621 0.04690 0.12508 0.18412 0.21638 0.24907
## unratel3 -0.06053 0.04735 -0.15218 -0.09354 -0.06086 -0.02887
## unratel4 -0.06263 0.04699 -0.15321 -0.09532 -0.06324 -0.03038
## unratel5 0.02284 0.04603 -0.06607 -0.00792 0.02203 0.05327
## unratel6 -0.12296 0.03312 -0.18919 -0.14475 -0.12360 -0.10020
## sigma 0.19244 0.00462 0.18381 0.18917 0.19233 0.19549
## mean_PPD 5.77068 0.00943 5.75230 5.76444 5.77046 5.77701
## log-posterior 169.88249 4.01246 161.01179 167.36055 170.35050 172.79960
## 97.5%
## (Intercept) 0.15024
## unratel1 1.05546
## unratel2 0.30608
## unratel3 0.03303
## unratel4 0.03033
## unratel5 0.11476
## unratel6 -0.05885
## sigma 0.20181
## mean_PPD 5.78938
## log-posterior 176.36728
##
## Diagnostics:
## mcse Rhat n_eff
## (Intercept) 0.00038 0.99957 4079
## unratel1 0.00054 0.99946 3891
## unratel2 0.00077 0.99960 3754
## unratel3 0.00079 0.99938 3553
## unratel4 0.00085 0.99995 3092
## unratel5 0.00106 1.00051 1871
## unratel6 0.00058 1.00022 3215
## sigma 0.00008 1.00318 3138
## mean_PPD 0.00016 1.00149 3357
## log-posterior 0.16104 1.00470 621
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Show the priors actually used; rstanarm reports its automatic rescaling
# of the intercept and sigma priors ("adjusted scale" in the output).
prior_summary(AR6_unrate_c)
## Priors for model 'AR6_unrate_c'
## ------
## Intercept (after predictors centered)
## ~ normal(location = 0, scale = 10)
## **adjusted scale = 16.36
##
## Coefficients
## ~ laplace(location = [0,0,0,...], scale = [2.5,2.5,2.5,...])
## **adjusted scale = [2.50,2.50,2.50,...]
##
## Auxiliary (sigma)
## ~ exponential(rate = 1)
## **adjusted scale = 1.64 (adjusted rate = 1/adjusted scale)
## ------
## See help('prior_summary.stanreg') for more details
# Histogram of the 4000 posterior predictive draws.
hist(predict_AR6_unrate_c)
# Recent unrate history with the point forecast marked at 2019.33.
# NOTE(review): the plotted value 3.889 does not match the posterior
# predictive mean 3.813 printed above -- confirm where 3.889 comes from.
autoplot(subset(unrate, start = 800), color = "blue") + ggtitle("Forecast for Unemployment Rate") + labs(x = "Date", y = "Unemployment Rate (%)") + geom_point(aes(x = 2019.33, y = 3.889), color = "red")
# payems
# Same construction as for unrate: payems and its first six lags over a
# common monthly window.
payemslags<-data.frame(
window(payems,start=c(1939,7),end=c(2019,3)),
window(stats::lag(payems,-1),start=c(1939,7),end=c(2019,3)),
window(stats::lag(payems,-2),start=c(1939,7),end=c(2019,3)),
window(stats::lag(payems,-3),start=c(1939,7),end=c(2019,3)),
window(stats::lag(payems,-4),start=c(1939,7),end=c(2019,3)),
window(stats::lag(payems,-5),start=c(1939,7),end=c(2019,3)),
window(stats::lag(payems,-6),start=c(1939,7),end=c(2019,3)))
colnames(payemslags)<-c("payems","payemsl1","payemsl2","payemsl3","payemsl4","payemsl5", "payemsl6")
ptoday<-length(payemslags$payems) #Last observation
# One-step-ahead regressor row (shift every series forward one period).
# NOTE(review): 7 values are collected but only 6 names assigned, so the
# trailing payemsl6 value keeps an NA name and is unused by the formula.
ptodayslags<-data.frame(payemslags$payems[ptoday],payemslags$payemsl1[ptoday],
payemslags$payemsl2[ptoday],payemslags$payemsl3[ptoday],payemslags$payemsl4[ptoday], payemslags$payemsl5[ptoday], payemslags$payemsl6[ptoday])
names(ptodayslags)<-c("payemsl1","payemsl2","payemsl3","payemsl4","payemsl5", "payemsl6")
options(mc.cores = parallel::detectCores())
# Bayesian AR(6) for payems with Laplace priors on the lag coefficients.
AR6_payems_c = stan_glm(payems ~ payemsl1 + payemsl2 + payemsl3 + payemsl4 + payemsl5 + payemsl6, data = payemslags, prior = laplace())
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.95 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
# NOTE(review): the sampler reports 2 divergent transitions after warmup;
# consider refitting with control = list(adapt_delta = 0.99) to confirm the
# estimates are unaffected.
summary(AR6_payems_c, digits = 5)
##
## Model Info:
##
## function: stan_glm
## family: gaussian [identity]
## formula: payems ~ payemsl1 + payemsl2 + payemsl3 + payemsl4 + payemsl5 +
## payemsl6
## algorithm: sampling
## priors: see help('prior_summary')
## sample: 4000 (posterior sample size)
## observations: 956
## predictors: 7
##
## Estimates:
## mean sd 2.5% 25% 50%
## (Intercept) 28.74168 7.00689 14.89944 24.00002 28.76728
## payemsl1 0.25949 0.03204 0.19540 0.23805 0.25952
## payemsl2 0.26353 0.03274 0.19886 0.24152 0.26331
## payemsl3 0.14192 0.03404 0.07374 0.11896 0.14200
## payemsl4 0.08597 0.03394 0.01815 0.06299 0.08554
## payemsl5 0.10459 0.03235 0.04077 0.08274 0.10425
## payemsl6 -0.08256 0.03147 -0.14227 -0.10408 -0.08260
## sigma 174.42756 4.05059 166.74347 171.64175 174.45065
## mean_PPD 125.86869 8.12384 109.62648 120.35179 125.93586
## log-posterior -6319.22224 3.84123 -6327.82950 -6321.54233 -6318.78322
## 75% 97.5%
## (Intercept) 33.47942 42.35270
## payemsl1 0.28166 0.32055
## payemsl2 0.28576 0.32836
## payemsl3 0.16502 0.20880
## payemsl4 0.10853 0.15459
## payemsl5 0.12707 0.16712
## payemsl6 -0.06139 -0.01968
## sigma 177.14631 182.40338
## mean_PPD 131.40612 141.87530
## log-posterior -6316.44939 -6312.81859
##
## Diagnostics:
## mcse Rhat n_eff
## (Intercept) 0.12960 0.99941 2923
## payemsl1 0.00050 1.00059 4174
## payemsl2 0.00050 1.00010 4310
## payemsl3 0.00054 0.99983 4043
## payemsl4 0.00052 0.99958 4194
## payemsl5 0.00052 0.99995 3854
## payemsl6 0.00050 0.99947 3970
## sigma 0.07102 0.99937 3253
## mean_PPD 0.14206 0.99952 3270
## log-posterior 0.17967 1.00225 457
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Posterior predictive draws for next month's change in payroll employment.
predict_AR6_payems_c = posterior_predict(AR6_payems_c, newdata = ptodayslags)
summary(predict_AR6_payems_c)
## 1
## Min. :-486.59
## 1st Qu.: 27.31
## Median : 149.74
## Mean : 147.06
## 3rd Qu.: 266.10
## Max. : 799.90
# 95% posterior predictive interval -- wide, reflecting the series' volatility.
quantile(predict_AR6_payems_c, c(0.025, 0.975))
## 2.5% 97.5%
## -187.9131 475.8982
# Recent payems history with the point forecast marked at 2019.33.
# NOTE(review): the plotted value 194.10 does not match the posterior
# predictive mean 147.06 or median 149.74 above -- confirm its source.
autoplot(subset(payems, start = 800), color = "blue") + ggtitle("Forecast for Changes in Nonfarm Payroll Employment") + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)") + geom_point(aes(x = 2019.33, y = 194.10), color = "red")
Machine Learning Procedures: Tree, Random Forest, and Boosting
For my ML methods, I chose to create a dataset containing the lagged values of unrate/payems up to the 6th lag, and I also added the variables that I included in my VAR model earlier, which are known to be related to employment.
# Adopted from Professor Childer's code
# Unrate
# Features: six lags of unrate plus one-month lags of payems and the VAR
# covariates (nasdaq, gas, cpi, ffr); na.omit() drops rows lost to lagging.
df_2 = data.frame(na.omit(cbind(unrate, stats::lag(unrate, -1), stats::lag(unrate, -2), stats::lag(unrate, -3), stats::lag(unrate, -4), stats::lag(unrate, -5), stats::lag(unrate, -6), stats::lag(payems, -1), stats::lag(nasdaq,-1), stats::lag(gas,-1), stats::lag(cpi,-1), stats::lag(ffr,-1))))
colnames(df_2) = c("unrate", "unratelag1", "unratelag2", "unratelag3", "unratelag4", "unratelag5", "unratelag6", "payems", "nasdaq", "gas", "cpi", "ffr")
set.seed(998)
# NOTE(review): a random 75/25 split ignores time order, so test observations
# can precede training ones; the "test" RMSE is therefore closer to an
# in-sample measure than a true out-of-sample forecast error.
inTraining <- createDataPartition(df_2$unrate, p = .75, list = FALSE)
training <- df_2[ inTraining,]
testing <- df_2[-inTraining,]
# Decision tree tuned by 10-fold CV repeated 10 times.
# (An untuned rpart() fit previously computed here was unused and removed.)
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10)
rpfit1 <- train(unrate ~ ., data = training,
  method = "rpart",
  trControl = fitControl,
  na.action = na.exclude)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
# Random forest with package defaults (no resampling-based tuning).
forestControl <- trainControl(method = "none")
rffit1 <- train(unrate ~ ., data = training,
  method = "rf",
  trControl = forestControl,
  na.action = na.exclude)
splitspertree <- rffit1$finalModel$mtry  # variables tried at each split
numbertrees <- rffit1$finalModel$ntree   # number of trees grown
# Boosted trees tuned by caret (kept for comparison; the hand-parameterized
# xgb.train fit below is the one actually used for forecasting).
xgbfitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10)
xgbfit1 <- train(unrate ~ ., data = training,
  method = "xgbTree",
  trControl = xgbfitControl,
  na.action = na.exclude)
# Stumps (max_depth = 1) with squared-error loss. NOTE(review):
# "reg:linear" is a deprecated alias of "reg:squarederror" in newer xgboost
# releases; behavior is identical apart from a deprecation warning.
nrounds <- 50
cvparam <- list(max_depth = 1, eta = 0.3, gamma = 0, colsample_bytree = 0.8,
  min_child_weight = 1, subsample = 1, objective = "reg:linear", eval_metric = "rmse")
trainfeat <- select(training, -one_of("unrate"))
testfeat <- select(testing, -one_of("unrate"))
dtrain <- xgb.DMatrix(as.matrix(trainfeat), label = training$unrate)
dtest <- xgb.DMatrix(as.matrix(testfeat), label = testing$unrate)
watchlist <- list(train = dtrain, eval = dtest)
bst <- xgb.train(params = cvparam, data = dtrain, verbose = 0, nrounds = nrounds, watchlist = watchlist)
# Forecasts
rppreds <- predict(rpfit1$finalModel, newdata = testing) #Decision Tree
rfpreds <- predict(rffit1$finalModel, newdata = na.roughfix(testing)) #Random Forest
bstpreds <- predict(bst, newdata = as.matrix(testfeat)) #Tree Boosting
# Evaluation: test-set squared errors vs. reported training RMSE per method.
prederrors <- data.frame((rppreds - testing$unrate)^2, (rfpreds - testing$unrate)^2,
  (bstpreds - testing$unrate)^2)
MSEvec <- colMeans(prederrors, na.rm = TRUE)
TestRMSE <- sqrt(MSEvec)
rprmse <- rpfit1$results$RMSE[1]
# Index by the actual tree count / round count instead of the magic
# constants 500 and 50, so these stay correct if the fits are reconfigured.
rfrmse <- sqrt(rffit1$finalModel$mse[numbertrees])
bstrmse <- bst$evaluation_log$train_rmse[nrounds]
TrainRMSE <- c(rprmse, rfrmse, bstrmse)
resmat <- as.matrix(data.frame(TestRMSE, TrainRMSE))
fitresults <- data.frame(t(resmat))
colnames(fitresults) <- c("Tree", "Random Forest", "Boosting")
fitresults
## Tree Random Forest Boosting
## TestRMSE 0.5035231 0.2740606 0.2070531
## TrainRMSE 0.5533178 0.2469407 0.1537530
# Plot the three point forecasts (red = tree, magenta = forest, green = boosting).
autoplot(subset(unrate, start = 800), color = "blue") + ggtitle("Forecast for Unemployment Rate") + labs(x = "Date", y = "Unemployment Rate (%)") + geom_point(aes(x = 2019.33, y = 4.528169), color = "red") + geom_point(aes(x = 2019.33, y = 3.864400), color = "magenta") + geom_point(aes(x = 2019.33, y = 3.854172), color = "darkgreen")
# Adopted from Professor Childer's code
# Payems
# Features: six lags of payems plus one-month lags of unrate and the VAR
# covariates (nasdaq, gas, cpi, ffr); na.omit() drops rows lost to lagging.
df_2 = data.frame(na.omit(cbind(payems, stats::lag(payems, -1), stats::lag(payems, -2), stats::lag(payems, -3), stats::lag(payems, -4), stats::lag(payems, -5), stats::lag(payems, -6), stats::lag(unrate, -1), stats::lag(nasdaq,-1), stats::lag(gas,-1), stats::lag(cpi,-1), stats::lag(ffr,-1))))
colnames(df_2) = c("payems", "payemslag1", "payemslag2", "payemslag3", "payemslag4", "payemslag5", "payemslag6", "unrate", "nasdaq", "gas", "cpi", "ffr")
set.seed(998)
# NOTE(review): random split ignores time order (see the unrate section).
inTraining <- createDataPartition(df_2$payems, p = .75, list = FALSE)
training <- df_2[ inTraining,]
testing <- df_2[-inTraining,]
# Decision tree tuned by 10-fold CV repeated 10 times.
# (An untuned rpart() fit previously computed here was unused and removed.)
fitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10)
rpfit1 <- train(payems ~ ., data = training,
  method = "rpart",
  trControl = fitControl,
  na.action = na.exclude)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
# Random forest; reuses `forestControl` (method = "none") from the unrate section.
rffit1 <- train(payems ~ ., data = training,
  method = "rf",
  trControl = forestControl,
  na.action = na.exclude)
splitspertree <- rffit1$finalModel$mtry  # variables tried at each split
numbertrees <- rffit1$finalModel$ntree   # number of trees grown
# Boosted trees tuned by caret (comparison only; xgb.train below is used).
xgbfitControl <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 10)
xgbfit1 <- train(payems ~ ., data = training,
  method = "xgbTree",
  trControl = xgbfitControl,
  na.action = na.exclude)
# Stumps with squared-error loss; "reg:linear" is a deprecated alias of
# "reg:squarederror" in newer xgboost releases.
nrounds <- 50
cvparam <- list(max_depth = 1, eta = 0.3, gamma = 0, colsample_bytree = 0.8,
  min_child_weight = 1, subsample = 1, objective = "reg:linear", eval_metric = "rmse")
trainfeat <- select(training, -one_of("payems"))
testfeat <- select(testing, -one_of("payems"))
dtrain <- xgb.DMatrix(as.matrix(trainfeat), label = training$payems)
dtest <- xgb.DMatrix(as.matrix(testfeat), label = testing$payems)
watchlist <- list(train = dtrain, eval = dtest)
bst <- xgb.train(params = cvparam, data = dtrain, verbose = 0, nrounds = nrounds, watchlist = watchlist)
# Forecasts
rppreds <- predict(rpfit1$finalModel, newdata = testing) #Decision Tree
rfpreds <- predict(rffit1$finalModel, newdata = na.roughfix(testing)) #Random Forest
bstpreds <- predict(bst, newdata = as.matrix(testfeat)) #Tree Boosting
# Evaluation: test-set squared errors vs. reported training RMSE per method.
prederrors <- data.frame((rppreds - testing$payems)^2, (rfpreds - testing$payems)^2,
  (bstpreds - testing$payems)^2)
MSEvec <- colMeans(prederrors, na.rm = TRUE)
TestRMSE <- sqrt(MSEvec)
rprmse <- rpfit1$results$RMSE[1]
# Index by actual tree/round counts rather than the magic constants 500/50.
rfrmse <- sqrt(rffit1$finalModel$mse[numbertrees])
bstrmse <- bst$evaluation_log$train_rmse[nrounds]
TrainRMSE <- c(rprmse, rfrmse, bstrmse)
resmat <- as.matrix(data.frame(TestRMSE, TrainRMSE))
fitresults <- data.frame(t(resmat))
colnames(fitresults) <- c("Tree", "Random Forest", "Boosting")
fitresults
## Tree Random Forest Boosting
## TestRMSE 150.6497 121.0619 137.79966
## TrainRMSE 158.8773 119.6887 82.91631
# Plot the three point forecasts (red = tree, magenta = forest, green = boosting).
autoplot(subset(payems, start = 800), color = "blue") + ggtitle("Forecast for Changes in Nonfarm Payroll Employment") + labs(x = "Date", y = "Change in Employment (100's of thousands of persons)") + geom_point(aes(x = 2019.33, y = 146.7534), color = "red") + geom_point(aes(x = 2019.33, y = 213.272733), color = "magenta") + geom_point(aes(x = 2019.33, y = 246.33800), color = "darkgreen")
For both series, the tree method seems to perform substantially worse than random forest and boosting, so I will remove the tree forecasts from my final average.
For my first ever forecast (for January), I relied heavily on expert judgement and ended up doing quite poorly, so for my second forecast (for February), I decided to equally weight model averages. I definitely improved my estimates, but I was still not as accurate as I felt I could be, and I realized a potential reason for this was that I was not selective enough in my model selection - some models were more similar than others in terms of diagnostics/loss measures, and so I kept the majority of them in my final averaging, when being a little more selective could have given me better estimates. So, for my third forecast (for March), I again used an equal-weighted average but was much more selective, and I ended up performing quite well. Thus, I decided to carry out the same method for my final forecast, making sure to be equally if not more selective, and I also made an effort to gauge the opinion of experts when formulating my final forecast.
After evaluating the baseline methods and removing forecasts that performed substantially worse in regards to diagnostics and RMSE, I decided to average the remaining forecasts to create my final forecast. The reasons I decided to go with an equal-weight model average are the following: this method minimizes risk, reduces the chance of overfitting as it is not dependent on the data, improves square-loss risk, and, if all methods are performing somewhat similarly - which I feel has been the case for most forecasts of the two series we have been dealing with - it gives every method the chance to contribute to the forecast.
For unrate, I removed the AR(1), AR(2), and tree forecasts as they underperformed compared to other methods - this resulted in a forecast of 3.79%.
For payems, I also removed the AR(1), AR(2), and tree forecasts as they underperformed compared to other methods. Because nonfarm payrolls growth has been quite volatile recently, I decided to research what experts in the industry expect April's numbers to look like and found that estimates were in the 170k-180k range, which is substantially higher than some of my estimates. I thus decided to remove two additional forecasts: VAR(2) and VAR(5), which estimated the growth to be 101k and 114k respectively. Removing the necessary estimates resulted in a forecast of 170k.